# =============================================================================
# =============================================================================
# Create the cohorts to analyze Safegraph data.
# =============================================================================
# =============================================================================

# Run settings. `ts` is defined outside this section; only its 'geog' and
# 'freq' entries are read here. `freq` selects which cohort panel file is
# loaded below.
geo, freq = ts['geog'], ts['freq']
# Suffix inserted between the analysis output directory and the file name when
# the output paths are built; the empty string adds nothing.
afolder = ''

# =============================================================================
# Import cohorts
# Load the cohort panel that matches the configured frequency.
tcp = pd.read_parquet(cdd['p_d_geo_tcp'] + r'\tcp_25k_zip_' + freq + '.parquet')

# Subset the tcp panel to the desired radius around treated zones: zip codes
# within 10km of a plasma center opening (present or counterfactual).
# NOTE(review): the second mask is a subset of the first, and rows admitted
# only by the third mask (ctype == 0) are removed again by the ctype > 0
# filter just below -- confirm that this is the intended sample.
within_radius = tcp.distance < 10000
treated_within_radius = (tcp.treated == 1) & (tcp.distance < 10000)
cf_within_radius = (tcp.ctype == 0) & (tcp.distance_t2cf < 25000)
tcps = tcp[within_radius | treated_within_radius | cf_within_radius].copy()

# Keep openings dated 2018-04-01 through 2021-04-01 (both ends included) and
# rows with a positive ctype.
open_window_start = dt.datetime(2018, 4, 1)
open_window_end = dt.datetime(2021, 4, 1)
opened_in_window = tcps['open'].between(open_window_start, open_window_end)
tcps = tcps[opened_in_window & (tcps.ctype > 0)]

# Keep only cohorts with both treated and control geographies that are strongly
# treated (within 5km of new center and the change in distance to plasma center
# is at least 5km).
strongly_treated = (
    (tcps.ctype > 0)
    & (tcps.distance < 5000)
    & (tcps.intensity_absdelta_tcf > 5000)
    & (tcps.treated == 1)
)
tcps['goodcohort'] = 1 * strongly_treated
# A cohort is "good" as soon as one of its rows is strongly treated.
tcps['goodcohort'] = tcps.groupby('cohort')['goodcohort'].transform('max')
# `treated` must vary within the cohort, i.e. its max and min differ.
treated_by_cohort = tcps.groupby('cohort')['treated']
treated_varies = treated_by_cohort.transform('max') != treated_by_cohort.transform('min')
tcps = tcps[(tcps['goodcohort'] == 1) & treated_varies]

# =============================================================================
# Merge pattern data with cohort panel.

# Load the sampled Safegraph pattern table and the matching core table.
sample_dir = cdd['p_d_sg'] + r'\Sample'
pt = pd.read_parquet(sample_dir + r'\pattern_cleaned_sample.parquet')
core = pd.read_parquet(sample_dir + r'\core_cleaned_sample.parquet')

# Merge in the core information: attach each place's NAICS code by placekey.
cvars = ['placekey', 'naics']
pt = pt.merge(core[cvars], how='left', on='placekey')
# `date` is the key used later to merge with the cohort panel.
pt['date'] = pt['date_e']

# Subset to keep only observations whose NAICS code is flagged keep == 1 in the
# 'naics' sheet of the variables workbook.
core_dtype = pd.read_excel(
    cdd['p_c'] + r'\3_Analysis\Safegraph_Variables.xlsx',
    sheet_name='naics',
)
core_dtype = core_dtype.loc[core_dtype['keep'] == 1, ['naics', 'group']]
# NAICS code -> group lookup (not used in this section).
core_naics_gps = dict(zip(core_dtype['naics'].tolist(), core_dtype['group'].tolist()))
kept_naics = core_dtype['naics'].unique()
pt = pt[pt['naics'].isin(kept_naics)]


def _tenth_smallest_distinct(ranks):
    """Return the 10th-smallest distinct value of *ranks* (the largest one if fewer than 10 exist)."""
    return ranks.drop_duplicates().nsmallest(10).iloc[-1]


#Keep the ten closest counterfactual areas to the treated area as controls:
#per cohort, rows whose rank does not exceed the 10th-smallest distinct rank.
rank_cutoff = tcps.groupby('cohort')['distance_t2cf_rank'].transform(_tenth_smallest_distinct)
tcpss = tcps[tcps['distance_t2cf_rank'] <= rank_cutoff].copy()
#Drop cohorts whose `treated` values no longer sum to a positive number.
treated_total = tcpss.groupby('cohort')['treated'].transform('sum')
tcpss = tcpss[treated_total > 0]
#Print a summary of the subset, including its deep memory footprint.
tcpss.info(memory_usage='deep')


#Update the ranking variables to facilitate subsetting: within each cohort the
#rank column is replaced by a dense 0-based rank (tied rows share one value).
import random as rn  # not used in this section; kept in case later code relies on it
for v in ['distance_t2cf_rank']:
    tcpss.sort_values(by=['cohort',v],inplace=True)
    # 1 on the first row of every distinct (cohort, rank) pair, 0 on repeats ...
    tcpss[v] = 1 - tcpss.duplicated(subset=['cohort',v])
    # ... so the running total within a cohort, minus one, is the dense rank.
    tcpss[v] = tcpss.groupby('cohort')[v].cumsum()-1
#Merge the pt with tcpss (takes about 10min)
tcpvars = [ 'open', 'cohort_cf', 'distance', 'intensity_absdelta_tcf', 'open_cf', 
           'ctype', 'etime', 'treated', 'distance_t2cf_rank']
# `ldiff` is defined outside this section; presumably it returns the entries of
# tcpvars that are not already columns of pt -- TODO confirm.
pts = pd.merge(pt,tcpss[ldiff(tcpvars,list(pt)) + ['cohort','zip','date']],how='inner',on=['zip','date'])
# Raw string: '\p' is not a valid escape sequence, so the non-raw literal
# triggers a SyntaxWarning on recent Python versions. The path is unchanged.
pts.to_parquet(cdd['p_d_sg_pa'] + afolder + r'\pts.parquet')
#Note: because of the random sampling of safegraph data from a single month there are no observations that end up in pts.


# =============================================================================
#Import the controls.
# NOTE(review): the earlier comment also mentioned copying the 2019 controls to
# use in 2020 and 2021; no such step is visible in this section -- presumably it
# is done when the ACS file is built; confirm.
# Raw string: '\p' is not a valid escape sequence, so the non-raw literal
# triggers a SyntaxWarning on recent Python versions. The path is unchanged.
pts = pd.read_parquet(cdd['p_d_sg_pa'] + afolder + r'\pts.parquet')
zip_controls_aug = pd.read_parquet(cdd['p_d_acs_zip'] + r'\acs_2014_2022_s3.parquet')

#Merge in the population density, taken from the calendar year before the
#year of `open`.
dens_vars = ['zip', 'year', 'pop2sqkm_wq10']
pts['open_ypre'] = pts['open'].dt.year - 1
# Left merge: rows without a matching (zip, year) control keep a missing density.
pts = pd.merge(pts, zip_controls_aug[dens_vars].rename(columns={'year':'open_ypre'}), how='left', on=['zip','open_ypre'])
# The helper key is only needed for the merge.
pts.drop(columns=['open_ypre'],inplace=True)


#Last minute variable mods.
pts['placekey'] = pts['placekey'].astype('category')
# Event time divided by 3 and rounded down (presumably months -> quarters; confirm).
pts['etimeq'] = np.floor(pts['etime'] / 3)
pts.rename(columns={'intensity_absdelta_tcf':'ic'},inplace=True)
# Integer identifier with one value per placekey.
pts['id'] = pts.groupby(['placekey']).ngroup()

#Save the data
# Raw string: '\p' is not a valid escape sequence, so the non-raw literal
# triggers a SyntaxWarning on recent Python versions. The path is unchanged.
pts.to_parquet(cdd['p_d_sg_pa'] + afolder + r'\ptss.parquet')

#Build the final sample: reload the saved panel, keep the strongly treated
#rows, and mark places with a full pre-period / post-period.
# Raw strings below: '\p' is not a valid escape sequence, so the non-raw
# literals trigger a SyntaxWarning on recent Python versions. Paths are unchanged.
ptf = pd.read_parquet(cdd['p_d_sg_pa'] + afolder + r'\ptss.parquet')
ptf = ptf[(ptf.distance<5000)&(ptf.ic>5000)]
# 1 when the place's first date equals the first date observed in its cohort.
ptf['full_preperiod'] = 1*(ptf.groupby(['cohort','id'])['date'].transform('min')==ptf.groupby(['cohort'])['date'].transform('min'))
# 1 when the place's last date equals the last date observed in its cohort-zip.
# NOTE(review): the pre-period benchmark is per cohort while this one is per
# (cohort, zip) -- confirm the asymmetry is intended.
ptf['full_postperiod'] = 1*(ptf.groupby(['cohort','id'])['date'].transform('max')==ptf.groupby(['cohort','zip'])['date'].transform('max'))
# 1 for every row of a (cohort, place) that has an observation at etime == -1.
ptf['neg1present'] = 1*(ptf['etime']==-1)
ptf['neg1present'] = ptf.groupby(['cohort','id'])['neg1present'].transform('max')


#Attach the NAICS group and supergroup labels from the 'naics' sheet of the
#variables workbook. The lookup uses every row of the sheet.
naics_map = pd.read_excel(cdd['p_c'] + '/3_Analysis/Safegraph_Variables.xlsx',sheet_name='naics')
ptf['naics_g'] = ptf['naics'].map(naics_map.set_index('naics')['group']).astype('category')
ptf['naics_sg'] = ptf['naics'].map(naics_map.set_index('naics')['supergroup']).astype('category')

ptf.to_parquet(cdd['p_d_sg_pa'] + afolder + r'\ptf.parquet')



